1 Info

Here I train the 16x video classifiers with the merged data, i.e. using stock culture data from every time point and light setting recorded.

2 Dependencies

library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)
library(parallel)
library(doParallel)

3 Data: load in and cleaning

3.1 Normal light

3.1.1 1st classifier data (2020-2021)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Coleps_irchel/Morph_mvt.RData")
dd25 <- morph_mvt

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Euplotes/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Euplotes_second/Morph_mvt.RData")
# morph_mvt <- morph_mvt %>% filter(mean_area>3000)
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Paramecium_bursaria/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Paramecium_caudatum/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Stylonychia_1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Stylonychia_1_second/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia1"
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Stylonychia_2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2"
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Dexiostoma/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Loxocephallus/Morph_mvt.RData")
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Tetrahymena/Morph_mvt.RData")
morph_mvt$species <- "Tetrahymena"
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Cryptomonas/Morph_mvt.RData")
morph_mvt$species <- "Cryptomonas"
dd25 <- rbind(dd25, morph_mvt)

load("../../Class_18C_normalLight/1st_class_2020/data/25x/Debris/Morph_mvt.RData")
morph_mvt$species <- "Debris_and_other"
morph_mvt <- morph_mvt %>% filter(mean_area<700)
dd25 <- full_join(dd25, morph_mvt)

# filtering
dd25 <- dd25 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)%>%
  select(-edible_algae,-microcosm.nr)

dd25$magnification <- 2.5

# table(dd25$species)

3.1.2 2nd classifier data (2022 Feb) - normal light

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Coleps_irchel/Morph_mvt.RData")
dd25_2022feb <- morph_mvt

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Colpidium/Morph_mvt.RData")
morph_mvt$species <- "Colpidium"
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Dexiostoma/Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Euplotes/Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Loxochephalus//Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Paramecium_bursaria/Morph_mvt.RData")
# morph_mvt %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   geom_vline(xintercept = 3000)
morph_mvt <- morph_mvt %>% dplyr::filter(mean_area>3000)
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Paramecium_caudatum/Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Stylonychia2_batch1/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2_batch1"
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Stylonychia2_batch2/Morph_mvt.RData")
morph_mvt$species <- "Stylonychia2_batch2"
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

load("../../Class_18C_normalLight/2nd_class_2022Feb/data/25x/Tetrahymena/Morph_mvt.RData")
dd25_2022feb <- rbind(dd25_2022feb, morph_mvt)

# filtering

dd25_2022feb <- dd25_2022feb %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_2022feb$magnification <- 2.5

# table(dd25_2022feb$species)

3.1.3 3rd data (20220301) - normal light

load("../../Class_18C_normalLight/3rd_data_20220301/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220301 <- morph_mvt

load("../../Class_18C_normalLight/3rd_data_20220301/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220301 <- rbind(dd25_20220301, morph_mvt)

# filtering

dd25_20220301 <- dd25_20220301 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220301$magnification <- 2.5

# dd25_20220301 %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd25_20220301 <- dd25_20220301 %>%
  dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes") & mean_area<2000))

# table(dd25_20220301$species)

Paramecium_caudatum maybe needs more cleaning

3.1.4 4th data (20220316) - normal light

load("../../Class_18C_normalLight/4th_data_20220316/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220316 <- morph_mvt

load("../../Class_18C_normalLight/4th_data_20220316/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220316 <- rbind(dd25_20220316, morph_mvt)

# filtering

dd25_20220316 <- dd25_20220316 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220316$magnification <- 2.5

# dd25_20220316 %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd25_20220316 <- dd25_20220316 %>%
  dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes","Stylonychia_2") & mean_area<2000))

# table(dd25_20220316$species)

3.1.5 5th data (20220406) - normal light

load("../../Class_18C_normalLight/5th_data_20220406/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220406 <- morph_mvt

load("../../Class_18C_normalLight/5th_data_20220406/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220406 <- rbind(dd25_20220406, morph_mvt)

# filtering

dd25_20220406 <- dd25_20220406 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220406$magnification <- 2.5

# dd25_20220406 %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd25_20220406 <- dd25_20220406 %>%
  dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes","Stylonychia_2") & mean_area<2000))

# table(dd25_20220406$species)

3.1.6 6th data (20220706) - normal light

load("../../Class_18C_normalLight/6th_data_20220706/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220706 <- morph_mvt

load("../../Class_18C_normalLight/6th_data_20220706/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220706 <- rbind(dd25_20220706, morph_mvt)

# filtering

dd25_20220706 <- dd25_20220706 %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220706$magnification <- 2.5

# dd25_20220706 %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd25_20220706 <- dd25_20220706 %>%
  dplyr::filter(!(species %in% c("Paramecium_caudatum","Paramecium_bursaria","Euplotes","Stylonychia_2") & mean_area<2000))

# table(dd25_20220706$species)

3.2 Decreasing light

3.2.1 2nd classifier data (2022 Feb) - 18 perc

load("../../Class_18C_decreasingLight/Light_18perc/data/25x/Euplotes/Morph_mvt.RData")
dd25_2022feb_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_18perc/data/25x/Paramecium_bursaria/Morph_mvt.RData")

a <- morph_mvt %>%
  ggplot(aes(mean_area))+
  geom_histogram()

Pburs_dark_25x <- morph_mvt %>%
  dplyr::select(mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
                sd_major,mean_ar,sd_ar,
                mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
                min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed)

set.seed(23)
Pburs_dark_25x_clust <- kmeans(Pburs_dark_25x, centers = 2, nstart = 25, iter.max = 1000)
Pburs_dark_25x$cluster <- as.factor(Pburs_dark_25x_clust$cluster)
b <- Pburs_dark_25x %>%
  ggplot(aes(mean_area, col=cluster, fill=cluster))+
  geom_histogram(position = "identity", alpha=0.3)

Pburs_dark_25x_clust <- kmeans(Pburs_dark_25x, centers = 3, nstart = 25, iter.max = 1000)
Pburs_dark_25x$cluster <- as.factor(Pburs_dark_25x_clust$cluster)
c <- Pburs_dark_25x %>%
  ggplot(aes(mean_area, col=cluster, fill=cluster))+
  geom_histogram(position = "identity", alpha=0.3)

Pburs_dark_25x_clust <- kmeans(Pburs_dark_25x, centers = 4, nstart = 25, iter.max = 1000)
Pburs_dark_25x$cluster <- as.factor(Pburs_dark_25x_clust$cluster)
d <- Pburs_dark_25x %>%
  ggplot(aes(mean_area, col=cluster, fill=cluster))+
  geom_histogram(position = "identity", alpha=0.3)

morph_mvt <- morph_mvt %>%
  dplyr::mutate(cluster=as.factor(Pburs_dark_25x_clust$cluster)) %>%
  dplyr::filter(cluster !="4") %>%
  dplyr::select(-cluster)

e <- morph_mvt %>%
  ggplot(aes(mean_area))+
  geom_histogram()

# a + b + c + d + e + plot_annotation(title = "Cleaning of P.burs dark 25x")

dd25_2022feb_decrease <- rbind(dd25_2022feb_decrease, morph_mvt)

# filtering

dd25_2022feb_decrease <- dd25_2022feb_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_2022feb_decrease$magnification <- 2.5

# table(dd25_2022feb_decrease$species)

3.2.2 3rd data (20220301) - 10 perc

load("../../Class_18C_decreasingLight/Light_10perc/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220301_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_10perc/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220301_decrease <- rbind(dd25_20220301_decrease, morph_mvt)


# filtering

dd25_20220301_decrease <- dd25_20220301_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220301_decrease$magnification <- 2.5

# dd25_20220301_decrease %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd25_20220301_decrease <- dd25_20220301_decrease %>%
  dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2500),
                !(species %in% c("Coleps_irchel") & mean_area<1000))


# table(dd25_20220301_decrease$species)

3.2.3 4th data (20220316) - 6 perc

load("../../Class_18C_decreasingLight/Light_6perc/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220316_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_6perc/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220316_decrease <- rbind(dd25_20220316_decrease, morph_mvt)

# filtering

dd25_20220316_decrease <- dd25_20220316_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220316_decrease$magnification <- 2.5

# dd25_20220316_decrease %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd25_20220316_decrease <- dd25_20220316_decrease %>%
  dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000))

# table(dd25_20220316_decrease$species)

3.2.4 5th data (20220406) - 1 perc

load("../../Class_18C_decreasingLight/Light_1perc/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220406_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_1perc/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220406_decrease <- rbind(dd25_20220406_decrease, morph_mvt)

# filtering

dd25_20220406_decrease <- dd25_20220406_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220406_decrease$magnification <- 2.5

# dd25_20220406_decrease %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd25_20220406_decrease <- dd25_20220406_decrease %>%
  dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000))

# table(dd25_20220406_decrease$species)

3.2.5 6th data (20220706) - 1 perc

load("../../Class_18C_decreasingLight/Light_1perc/data_20220706/data/25x/large_ciliates/Morph_mvt.RData")
dd25_20220706_decrease <- morph_mvt

load("../../Class_18C_decreasingLight/Light_1perc/data_20220706/data/25x/small_ciliates/Morph_mvt.RData")
dd25_20220706_decrease <- rbind(dd25_20220706_decrease, morph_mvt)

# filtering

dd25_20220706_decrease <- dd25_20220706_decrease %>%
  filter(net_disp>50, net_speed>5, N_frames>10)
dd25_20220706_decrease$magnification <- 2.5

# dd25_20220706_decrease %>%
#   ggplot(aes(mean_area))+
#   geom_histogram()+
#   facet_wrap(~species, scales = "free")

dd25_20220706_decrease <- dd25_20220706_decrease %>%
  dplyr::filter(!(species %in% c("Paramecium_bursaria","Euplotes","Paramecium_caudatum") & mean_area<2000))

# table(dd25_20220706_decrease$species)

4 Further data Processing

4.1 Merge data

dd25$data <- "Late 2020"
dd25_2022feb$data <- "20220201"
dd25_2022feb_decrease$data <- "20220201"
dd25_20220301$data <- "20220301"
dd25_20220301_decrease$data <- "20220301"
dd25_20220316$data <- "20220316"
dd25_20220316_decrease$data <- "20220316"
dd25_20220406$data <- "20220406"
dd25_20220406_decrease$data <- "20220406"
dd25_20220706$data <- "20220706"
dd25_20220706_decrease$data <- "20220706"

dd_30perc <- rbind(dd25,
                   dd25_2022feb,dd25_20220301 %>% dplyr::select(-Light_dark),
                   dd25_20220316 %>% dplyr::select(-Light_dark),
                   dd25_20220406 %>% dplyr::select(-Light_dark),
                   dd25_20220706 %>% dplyr::select(-Light_dark))
dd_18perc <- dd25_2022feb_decrease
dd_10perc <- dd25_20220301_decrease %>%
  dplyr::select(-Light_dark)
dd_6perc <- dd25_20220316_decrease %>%
  dplyr::select(-Light_dark)
dd_1perc <- rbind(dd25_20220406_decrease %>% dplyr::select(-Light_dark),
                  dd25_20220706_decrease %>% dplyr::select(-Light_dark))

dd_30perc$light <- "30%"
dd_18perc$light <- "18%"
dd_10perc$light <- "10%"
dd_6perc$light <- "6%"
dd_1perc$light <- "1%"

dd <- rbind(dd_30perc,dd_18perc,dd_10perc,dd_6perc,dd_1perc)
dd$species <- ifelse(dd$species == "Stylonychia_2", "Stylonychia2", dd$species)
dd$species <- ifelse(dd$species %in% c("Stylonychia2_batch1","Stylonychia2_batch2"), "Stylonychia2", dd$species)

# table(dd$species, dd$data, dd$magnification, dd$light)

4.2 Filtering out outliers

If an individual is an outlier in more than 3 traits it gets excluded (about 7% gets excluded). Outlier based on boxplot definition.

dd$id <- 1:nrow(dd)
dd <- dd %>% na.omit()
dd_long <- dd %>%
  dplyr::select(species, mean_area,sd_area,mean_perimeter,sd_perimeter,mean_major,
                sd_major,mean_ar,sd_ar,
                mean_turning,sd_turning,gross_disp,max_net,net_disp,net_speed,max_step,
                min_step,sd_step,sd_gross_speed,max_gross_speed,min_gross_speed, id,
                data, light, magnification) %>%
  pivot_longer(cols=-c(id,species,data,light,magnification), names_to="variable") %>%
  dplyr::group_by(variable,species,data,light,magnification) %>%
  mutate(iqr = IQR(value),
         first_quartile = quantile(value, probs = 0.25),
         third_quartile = quantile(value, probs = 0.75),
         outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))

outliers <- dd_long %>%
  dplyr::filter(outlier==T) %>%
  dplyr::group_by(species, id, data, light, magnification) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::filter(n>3)

# table(outliers$species)
# nrow(outliers)/nrow(dd)

dd_filtered <- dd %>%
  dplyr::filter(!is.element(id,outliers$id))

dd$removed_outliers <- F
dd_filtered$removed_outliers <- T

dd_comparison <- rbind(dd,dd_filtered)

dd <- dd_filtered 

# dd_comparison %>%
#   ggplot(aes((mean_area), col=removed_outliers))+
#   geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
#   # geom_boxplot(outlier.alpha = 0.3) +
#   theme_bw() +
#   facet_wrap(species~interaction(light,data), scales = "free")
#  
# dd_filtered %>%
#   ggplot(aes(log10(mean_area)))+
#   geom_density(aes(col=species))

# table(dd$species, dd$data, dd$magnification, dd$light)

4.3 Sub-sampling to create training data

We have data from several time points and light settings. The goals is that in the final training data these are somewhat equally represented, regardless that the number of individuals tracked per species and time point can vary a lot (in other words: “even it out” across species, time point and light setting). This is important, as otherwise it can be that groups that are more abundant are weighted more.

dd <- dd %>%
  mutate(data.light = interaction(data,light, drop = T),
         data.light.species=interaction(data,light,species, drop = T))

print("number of individuals per species per date and light setting")
## [1] "number of individuals per species per date and light setting"
tab <- table(dd$data.light, dd$species)
tab
##                
##                 Coleps_irchel Colpidium Cryptomonas Debris_and_other Dexiostoma
##   20220406.1%              15       154           0                0         15
##   20220706.1%               6       178           0                0        203
##   20220301.10%            614       184           0                0        342
##   20220201.18%              0         0           0                0          0
##   20220201.30%            904       330           0                0        259
##   20220301.30%            587       193           0                0        317
##   20220316.30%             91       234           0                0        199
##   20220406.30%             73        93           0                0        100
##   20220706.30%            177       163           0                0        141
##   Late 2020.30%           531       155        3475              623        649
##   20220316.6%             147       212           0                0        120
##                
##                 Euplotes Loxocephallus Paramecium_bursaria Paramecium_caudatum
##   20220406.1%         35           362                 118                 137
##   20220706.1%         75           602                  96                  59
##   20220301.10%        33          1041                  87                 370
##   20220201.18%       708             0                1230                   0
##   20220201.30%       228           967                 488                 445
##   20220301.30%        55           834                 122                 564
##   20220316.30%       136           687                 860                 786
##   20220406.30%       148           403                 587                 329
##   20220706.30%       411           653                1238                 405
##   Late 2020.30%      836          1090                1342                1017
##   20220316.6%        354           695                 751                 780
##                
##                 Stylonychia1 Stylonychia2 Tetrahymena
##   20220406.1%              0          148         304
##   20220706.1%              0           18        1037
##   20220301.10%             0          448        1027
##   20220201.18%             0            0           0
##   20220201.30%             0          538        1035
##   20220301.30%             0          942         974
##   20220316.30%             0          642        1086
##   20220406.30%             0          625         554
##   20220706.30%             0         1592         968
##   Late 2020.30%          345          146        1176
##   20220316.6%              0          469         676
print("Sum of individuals per species")
## [1] "Sum of individuals per species"
colsums <- colSums(tab)
colsums
##       Coleps_irchel           Colpidium         Cryptomonas    Debris_and_other 
##                3145                1896                3475                 623 
##          Dexiostoma            Euplotes       Loxocephallus Paramecium_bursaria 
##                2345                3019                7334                6919 
## Paramecium_caudatum        Stylonychia1        Stylonychia2         Tetrahymena 
##                4892                 345                5568                8837

Of course, Stylo1 is the one we the least individuals, because it went extinct and we do not have it anymore. As it went extinct, it shouldn’t be the limiting factor, I think. For now at least I’m taking 3 * Stylo1_sum as the number of individuals per species to be included (if a species/class has less than that all individuals are included).

Within a class I use that number so that roughly equally many individuals across time steps and light settings are included. As a last thing I divide the data into a training dataset and in a test dataset.

n <- min(colsums)*4
print(paste0("Max number of individuals used per species (if there are fewer for a species, then all are used): n=",n))
## [1] "Max number of individuals used per species (if there are fewer for a species, then all are used): n=1380"
num_ind_per_class <- dd %>% group_by(species) %>% 
  summarize(num_class = length(unique(data.light.species)),
            num_ind_per_class = floor(n/num_class)) %>%
  select(species,num_ind_per_class)

num_ind_per_class_vec <- c(num_ind_per_class$num_ind_per_class)
names(num_ind_per_class_vec) <- num_ind_per_class$species

set.seed(53)

split_up <- split(dd, f = dd$data.light.species)
split_up <- lapply(split_up, function(x) {
  species <- unique(x$species)
  x %>% sample_n(ifelse(nrow(x) < num_ind_per_class_vec[species], nrow(x), num_ind_per_class_vec[species]))})
trainingdata <- do.call("rbind", split_up)

print("Subsampled: number of individuals per species per date and light setting")
## [1] "Subsampled: number of individuals per species per date and light setting"
tab2 <- table(trainingdata$data.light, trainingdata$species)
tab2
##                
##                 Coleps_irchel Colpidium Cryptomonas Debris_and_other Dexiostoma
##   20220406.1%              15       138           0                0         15
##   20220706.1%               6       138           0                0        138
##   20220301.10%            138       138           0                0        138
##   20220201.18%              0         0           0                0          0
##   20220201.30%            138       138           0                0        138
##   20220301.30%            138       138           0                0        138
##   20220316.30%             91       138           0                0        138
##   20220406.30%             73        93           0                0        100
##   20220706.30%            138       138           0                0        138
##   Late 2020.30%           138       138        1380              623        138
##   20220316.6%             138       138           0                0        120
##                
##                 Euplotes Loxocephallus Paramecium_bursaria Paramecium_caudatum
##   20220406.1%         35           138                 118                 137
##   20220706.1%         75           138                  96                  59
##   20220301.10%        33           138                  87                 138
##   20220201.18%       125             0                 125                   0
##   20220201.30%       125           138                 125                 138
##   20220301.30%        55           138                 122                 138
##   20220316.30%       125           138                 125                 138
##   20220406.30%       125           138                 125                 138
##   20220706.30%       125           138                 125                 138
##   Late 2020.30%      125           138                 125                 138
##   20220316.6%        125           138                 125                 138
##                
##                 Stylonychia1 Stylonychia2 Tetrahymena
##   20220406.1%              0          138         138
##   20220706.1%              0           18         138
##   20220301.10%             0          138         138
##   20220201.18%             0            0           0
##   20220201.30%             0          138         138
##   20220301.30%             0          138         138
##   20220316.30%             0          138         138
##   20220406.30%             0          138         138
##   20220706.30%             0          138         138
##   Late 2020.30%          345          138         138
##   20220316.6%              0          138         138
print("Subsampled: Sum of individuals per species")
## [1] "Subsampled: Sum of individuals per species"
colsums2 <- colSums(tab2)
colsums2
##       Coleps_irchel           Colpidium         Cryptomonas    Debris_and_other 
##                1013                1335                1380                 623 
##          Dexiostoma            Euplotes       Loxocephallus Paramecium_bursaria 
##                1201                1073                1380                1298 
## Paramecium_caudatum        Stylonychia1        Stylonychia2         Tetrahymena 
##                1300                 345                1260                1380
trainingdata <- trainingdata[complete.cases(trainingdata),]

# A stratified random split of the data
idx_train <- createDataPartition(trainingdata$species,
                                 p = 0.65, # percentage of data as training
                                 list = FALSE)

testdata <- trainingdata[-idx_train,]
trainingdata <- trainingdata[idx_train,]

5 Loading in species compositions

species <- unique(dd$species)
species <- species[!is.element(species,c("Debris_and_other","Cryptomonas"))]
compositions <- read_csv("../../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition

compositions <- compositions[,species]

compositions_list <- apply(compositions, 1, function(x){
  idx <- which(x==1)
  names(idx)
})

names(compositions_list) <- comp_name

## filterting out Stylo1
compositions_list <- lapply(compositions_list, function(x){
  x[x!="Stylonychia1"]
})

6 Classifiers

6.1 Classifier plotting functions

classifier_plot_svm <- function(table, combination.nr){
  # cm <- classifier$confusion
  cm <- table
  ncol <- ncol(cm)
  cm <- apply(cm, 1, function(x) x/sum(x))
  
  cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
                        Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
                        Classification = as.vector(cm))
  
  plot <- cm_long %>%
    ggplot(aes(Predicted,Truth,fill=Classification))+
    geom_tile() +
    geom_text(aes(label = round(Classification, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
    scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
    theme(axis.text.x = element_text(angle = 90, hjust = 0))+
    theme(legend.text = element_text(size=14),
          axis.title = element_text(size=14),
          title = element_text(size=16),
          axis.text = element_text(size=13))+
    labs(title=paste("PPV of",combination.nr),fill="PPV")
  
  return(plot)
}


classifier_assessment_plot <- function(cf, combination.nr){

  cf.df <- cf$byClass[,1:4] %>%
    as.data.frame() %>%
    mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
    rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
           "Sens" = "Sensitivity", "Spec" = "Specificity") %>%
    pivot_longer(cols = 1:4) %>%
    mutate(col = ifelse(value>=0.9,"1",
                        ifelse(value<0.9 & value>=0.8,"2",
                               ifelse(value<0.8 & value>=0.7,"3","4"))))

  plot <- cf.df %>%
    ggplot(aes(name,Species,fill=col))+
    geom_tile() +
    geom_text(aes(label = round(value, 3)), col="white") +
    scale_x_discrete(position = "top") +
    scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
    scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
    theme(legend.text = element_text(size=14),
          title = element_text(size = 16),
          axis.title = element_blank(),
          axis.text = element_text(size=13),
          legend.position = "none")+
    labs(title=combination.nr, fill="")
  
  return(plot)
}

6.2 Training of classifiers

formula <- factor(species) ~ mean_area + sd_area + mean_perimeter + sd_perimeter + mean_major +
        sd_major +  mean_ar + sd_ar +
        mean_turning + sd_turning + gross_disp + max_net + net_disp + net_speed + max_step +
        min_step + sd_step + sd_gross_speed + max_gross_speed + min_gross_speed

set.seed(474)
# classifiers_18c_25x <- list()
# classifiers_18c_25x_plots <- list() 
# classifiers_18c_25x_assess_plots <- list()
# classified_test_data <- list()

completeList <- mclapply(1:length(compositions_list), function(i){
  sub_trainingdata <- trainingdata %>%
    filter(species %in% c(compositions_list[[i]],"Cryptomonas","Debris_and_other"))
  sub_testdata <- testdata %>%
    filter(species %in% c(compositions_list[[i]],"Cryptomonas","Debris_and_other"))
  
  sub_trainingdata$species <- factor(sub_trainingdata$species)
  sub_testdata$species <- factor(sub_testdata$species)

  obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
            ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
            tunecontrol = tune.control(sampling = "fix", best.model = T))
  
  classifiers_18c_25x <- obj$best.model
  
  cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                       sub_testdata$species)
  
  sub_testdata <- sub_testdata %>%
    dplyr::mutate(predicted = predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
                  correct = species==predicted)
  
  sub_testdata_summ <- sub_testdata %>% 
    group_by(data, species, light, magnification, correct) %>%
    summarize(n = n()) %>%
    dplyr::mutate(perc_corr = n/sum(n),
                  composition = names(compositions_list)[i]) %>%
    dplyr::filter(correct==T)
  
  classified_test_data <- sub_testdata_summ
  
  classifiers_18c_25x_plots <- classifier_plot_svm(table = cf$table,
                                                        combination.nr = names(compositions_list)[[i]])
  
  classifiers_18c_25x_assess_plots <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
  
  list(classifiers_18c_25x, classified_test_data, classifiers_18c_25x_plots, classifiers_18c_25x_assess_plots)
}, mc.cores = detectCores()-3)

classifiers_18c_25x <- map(completeList, 1)
classified_test_data <- map(completeList, 2)
classifiers_18c_25x_plots <- map(completeList, 3)
classifiers_18c_25x_assess_plots <- map(completeList, 4)

classified_test_data <- do.call("rbind", classified_test_data) 

names(classifiers_18c_25x_plots) <- names(classifiers_18c_25x) <-
  names(classifiers_18c_25x_assess_plots) <- comp_name

6.3 Assessing of classifiers

There are mainly 4 measures:

  • Sensitivity: the probability that an individuals is classified as species X given that the it is of species X: P(Classified as species X | is of species X)

  • Specificity: the probability that an individuals is not classified as species X given that it is not of species X: P(Not classified as species X | is not of species X)

  • Positive Predictive Value PPV: the probability that an individual is of species X given that it has been classified as species X: P(is of species X | is classified as species X)

  • Negative Predictive Value NPV: the probability that an individuals is not of species X given that it has not been classified as species X: P(is not of species X | has not been classified as species X)

PPV is the most important for us!

library(patchwork)
for(i in 1:length(classifiers_18c_25x_plots)){
  print(classifiers_18c_25x_plots[[i]] + classifiers_18c_25x_assess_plots[[i]] + 
    plot_layout(widths = c(4,2)))
}

classified_test_data <- classified_test_data%>%
  dplyr::mutate(data=ifelse(data=="Late 2020","2020late",data),
                tot_n = n/perc_corr, data=factor(data, ordered = T,
                                                 levels=c("2020late","20220201","20220301","20220316","20220406","20220706")),
                light_num = as.numeric(gsub("%","",light)))

# classified_test_data %>%
#   dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
#   ggplot(aes(data, perc_corr, col=composition, shape=light, size=tot_n)) +
#   geom_hline(yintercept = 0.8, col="red") +
#   geom_hline(yintercept = 0.9, col="green")+
#   geom_point(position = position_jitter(0.2))+
#   facet_wrap(~species, nrow = 3)+
#   theme_bw() +
#   scale_size_continuous(breaks = c(1,10,20,30)) +
#   labs(size="Sample size of evaluation data", y="Correct identification in percentage", x="Date of data collection",
#        title="Species identification across date, light intensities and compositions")

classified_test_data %>%
  dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
  ggplot(aes(light_num, perc_corr))+
  geom_hline(yintercept = 0.8, col="red") +
  geom_point(aes(fill=composition,size=tot_n),shape=21,col="black",alpha=0.3)+
  facet_wrap(~species, nrow = 3)+
  scale_size_continuous(breaks = c(1,10,20,30)) +
  geom_smooth(method = "lm")+
  theme_bw() +
  labs(size="Sample size of evaluation data", y="Correct species identification in percentage", x="Light intensity in percentage",
       title="Species identification across light intensities and compositions")
## `geom_smooth()` using formula 'y ~ x'

# classified_test_data %>%
#   dplyr::filter(!(species %in% c("Cryptomonas","Debris_and_other"))) %>%
#   ggplot(aes(tot_n,perc_corr, col=species))+
#   geom_point()+
#   theme_bw()

7 Save classifiers

saveRDS(classifiers_18c_25x, "svm_video_classifiers_18c_25x_20220706_MergedData.rds")
# cl <- readRDS("classifiers_18c_25x.rds")